import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
dataset=pd.read_csv("Main_proj_sale_dataset.csv")
dataset.head()
| index | Order ID | Date | Status | Fulfilment | Sales Channel | ship-service-level | Style | SKU | Category | ... | currency | Amount | ship-city | ship-state | ship-postal-code | ship-country | promotion-ids | B2B | fulfilled-by | Unnamed: 22 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 405-8078784-5731545 | 04-30-22 | Cancelled | Merchant | Amazon.in | Standard | SET389 | SET389-KR-NP-S | Set | ... | INR | 647.62 | MUMBAI | MAHARASHTRA | 400081.0 | IN | NaN | False | Easy Ship | NaN |
| 1 | 1 | 171-9198151-1101146 | 04-30-22 | Shipped - Delivered to Buyer | Merchant | Amazon.in | Standard | JNE3781 | JNE3781-KR-XXXL | kurta | ... | INR | 406.00 | BENGALURU | KARNATAKA | 560085.0 | IN | Amazon PLCC Free-Financing Universal Merchant ... | False | Easy Ship | NaN |
| 2 | 2 | 404-0687676-7273146 | 04-30-22 | Shipped | Amazon | Amazon.in | Expedited | JNE3371 | JNE3371-KR-XL | kurta | ... | INR | 329.00 | NAVI MUMBAI | MAHARASHTRA | 410210.0 | IN | IN Core Free Shipping 2015/04/08 23-48-5-108 | True | NaN | NaN |
| 3 | 3 | 403-9615377-8133951 | 04-30-22 | Cancelled | Merchant | Amazon.in | Standard | J0341 | J0341-DR-L | Western Dress | ... | INR | 753.33 | PUDUCHERRY | PUDUCHERRY | 605008.0 | IN | NaN | False | Easy Ship | NaN |
| 4 | 4 | 407-1069790-7240320 | 04-30-22 | Shipped | Amazon | Amazon.in | Expedited | JNE3671 | JNE3671-TU-XXXL | Top | ... | INR | 574.00 | CHENNAI | TAMIL NADU | 600073.0 | IN | NaN | False | NaN | NaN |
5 rows × 24 columns
dataset.columns.tolist()
['index', 'Order ID', 'Date', 'Status', 'Fulfilment', 'Sales Channel ', 'ship-service-level', 'Style', 'SKU', 'Category', 'Size', 'ASIN', 'Courier Status', 'Qty', 'currency', 'Amount', 'ship-city', 'ship-state', 'ship-postal-code', 'ship-country', 'promotion-ids', 'B2B', 'fulfilled-by', 'Unnamed: 22']
print("Rows=",dataset.shape[0])
print("Columns=",dataset.shape[1])
Rows= 128975 Columns= 24
dataset.isnull().sum()
index 0 Order ID 0 Date 0 Status 0 Fulfilment 0 Sales Channel 0 ship-service-level 0 Style 0 SKU 0 Category 0 Size 0 ASIN 0 Courier Status 6872 Qty 0 currency 7795 Amount 7795 ship-city 33 ship-state 33 ship-postal-code 33 ship-country 33 promotion-ids 49153 B2B 0 fulfilled-by 89698 Unnamed: 22 49050 dtype: int64
# calculating the percentage of null values
dataset.isnull().sum() / len(dataset) * 100
index 0.000000 Order ID 0.000000 Date 0.000000 Status 0.000000 Fulfilment 0.000000 Sales Channel 0.000000 ship-service-level 0.000000 Style 0.000000 SKU 0.000000 Category 0.000000 Size 0.000000 ASIN 0.000000 Courier Status 5.328164 Qty 0.000000 currency 6.043807 Amount 6.043807 ship-city 0.025586 ship-state 0.025586 ship-postal-code 0.025586 ship-country 0.025586 promotion-ids 38.110487 B2B 0.000000 fulfilled-by 69.546811 Unnamed: 22 38.030626 dtype: float64
# Low null values (<5%):
# Columns: ship-city, ship-state, ship-postal-code, ship-country.
# categorical columns ship-city, ship-state, ship-postal-code, ship-country.
ds_copy=dataset.copy()
ds_copy[['ship-city', 'ship-state', 'ship-postal-code', 'ship-country']].head(5)
| ship-city | ship-state | ship-postal-code | ship-country | |
|---|---|---|---|---|
| 0 | MUMBAI | MAHARASHTRA | 400081.0 | IN |
| 1 | BENGALURU | KARNATAKA | 560085.0 | IN |
| 2 | NAVI MUMBAI | MAHARASHTRA | 410210.0 | IN |
| 3 | PUDUCHERRY | PUDUCHERRY | 605008.0 | IN |
| 4 | CHENNAI | TAMIL NADU | 600073.0 | IN |
# ship-city: fill the missing entries with the most frequent value (mode).
city_mode = ds_copy['ship-city'].mode().iloc[0]
print("mode value for ship-city column is :", city_mode)
ds_copy['ship-city'] = ds_copy['ship-city'].fillna(value=city_mode)
# Sanity check: no nulls should remain after the fill.
ds_copy['ship-city'].isnull().sum()
mode value for ship-city column is : BENGALURU
0
# ship-state: fill the missing entries with the most frequent value (mode).
ship_mode = ds_copy['ship-state'].mode().iloc[0]
print("mode value for ship-state column is :", ship_mode)
ds_copy['ship-state'] = ds_copy['ship-state'].fillna(value=ship_mode)
# Sanity check: no nulls should remain after the fill.
ds_copy['ship-state'].isnull().sum()
mode value for ship-state column is : MAHARASHTRA
0
# ship-postal-code: fill the missing entries with the most frequent value (mode).
postal_mode = ds_copy['ship-postal-code'].mode().iloc[0]
print("mode value for ship-postal-code column is :", postal_mode, type(postal_mode))
ds_copy['ship-postal-code'] = ds_copy['ship-postal-code'].fillna(value=postal_mode)
# Sanity check: no nulls should remain after the fill.
ds_copy['ship-postal-code'].isnull().sum()
mode value for ship-postal-code column is : 201301.0 <class 'numpy.float64'>
0
# Postal codes were read as floats (a side effect of the NaNs); now that every
# value is filled, cast to int to drop the spurious decimal part.
ds_copy['ship-postal-code'] = ds_copy['ship-postal-code'].astype(int)
ds_copy['ship-postal-code'].mode()[0]
201301
# ship-country: fill the missing entries with the most frequent value (mode).
country_mode = ds_copy['ship-country'].mode().iloc[0]
print("mode value for ship-country column is :", country_mode)
ds_copy['ship-country'] = ds_copy['ship-country'].fillna(value=country_mode)
# Sanity check: no nulls should remain after the fill.
ds_copy['ship-country'].isnull().sum()
mode value for ship-country column is : IN
0
# Moderate null values (5%-20%):
# columns: Courier Status, currency, Amount.
ds_copy[['Courier Status', 'currency', 'Amount']].head(5)
| Courier Status | currency | Amount | |
|---|---|---|---|
| 0 | NaN | INR | 647.62 |
| 1 | Shipped | INR | 406.00 |
| 2 | Shipped | INR | 329.00 |
| 3 | NaN | INR | 753.33 |
| 4 | Shipped | INR | 574.00 |
ds_copy['Courier Status'].unique().tolist()
[nan, 'Shipped', 'Cancelled', 'Unshipped']
ds_copy['Status'].unique().tolist()
['Cancelled', 'Shipped - Delivered to Buyer', 'Shipped', 'Shipped - Returned to Seller', 'Shipped - Rejected by Buyer', 'Shipped - Lost in Transit', 'Shipped - Out for Delivery', 'Shipped - Returning to Seller', 'Shipped - Picked Up', 'Pending', 'Pending - Waiting for Pick Up', 'Shipped - Damaged', 'Shipping']
# Rows where 'Courier Status' is missing, shown alongside the order 'Status'.
only_nulls = ds_copy.loc[ds_copy['Courier Status'].isna(), ['Status', 'Courier Status']]
print("The values of status column and Courier status column ", only_nulls.head(5))
# How many of those rows belong to cancelled orders?
only_nulls_cn = only_nulls.loc[only_nulls['Status'] == 'Cancelled', ['Courier Status', 'Status']]
print("No of rows has cancelled value in Status column", only_nulls_cn.shape[0])
# Compare against the total number of missing courier statuses.
print("No of rows has Null value in Courier status column", ds_copy['Courier Status'].isnull().sum())
# Nearly all missing courier statuses belong to cancelled orders (6861 of 6872
# per the output above), so 'Cancelled' is a reasonable fill value.
ds_copy['Courier Status'] = ds_copy['Courier Status'].fillna(value='Cancelled')
# Sanity check: no nulls should remain after the fill.
print("Null values count in Courier status column", ds_copy['Courier Status'].isnull().sum())
The values of status column and Courier status column Status Courier Status 0 Cancelled NaN 3 Cancelled NaN 23 Cancelled NaN 29 Cancelled NaN 83 Cancelled NaN No of rows has cancelled value in Status column 6861 No of rows has Null value in Courier status column 6872 Null values count in Courier status column 0
ds_copy['currency'].isnull().sum()
7795
# High null values (>50%): promotion-ids, fulfilled-by, Unnamed: 22.
# 'Unnamed: 22' is an unnecessary artifact column, so it is removed outright.
ds_copy = ds_copy.drop(columns=['Unnamed: 22'])
ds_copy.columns.tolist()
['index', 'Order ID', 'Date', 'Status', 'Fulfilment', 'Sales Channel ', 'ship-service-level', 'Style', 'SKU', 'Category', 'Size', 'ASIN', 'Courier Status', 'Qty', 'currency', 'Amount', 'ship-city', 'ship-state', 'ship-postal-code', 'ship-country', 'promotion-ids', 'B2B', 'fulfilled-by']
print(ds_copy['promotion-ids'].mode()[0])
print(ds_copy['fulfilled-by'].mode()[0])
IN Core Free Shipping 2015/04/08 23-48-5-108 Easy Ship
# promotion-ids: fill the missing entries with the most frequent value (mode).
promo_mode = ds_copy['promotion-ids'].mode().iloc[0]
print("mode value for promotion-ids column is :", promo_mode)
ds_copy['promotion-ids'] = ds_copy['promotion-ids'].fillna(value=promo_mode)
# Sanity check: no nulls should remain after the fill.
ds_copy['promotion-ids'].isnull().sum()
mode value for promotion-ids column is : IN Core Free Shipping 2015/04/08 23-48-5-108
0
# fulfilled-by: fill the missing entries with the most frequent value (mode).
ful_mode = ds_copy['fulfilled-by'].mode().iloc[0]
print("mode value for fulfilled-by column is :", ful_mode)
ds_copy['fulfilled-by'] = ds_copy['fulfilled-by'].fillna(value=ful_mode)
# Sanity check: no nulls should remain after the fill.
ds_copy['fulfilled-by'].isnull().sum()
mode value for fulfilled-by column is : Easy Ship
0
ds_copy.isnull().sum()
index 0 Order ID 0 Date 0 Status 0 Fulfilment 0 Sales Channel 0 ship-service-level 0 Style 0 SKU 0 Category 0 Size 0 ASIN 0 Courier Status 0 Qty 0 currency 7795 Amount 7795 ship-city 0 ship-state 0 ship-postal-code 0 ship-country 0 promotion-ids 0 B2B 0 fulfilled-by 0 dtype: int64
only_nulls = ds_copy[ds_copy['Amount'].isna()][['Status', 'Qty' , 'Amount']]
only_nulls.head(5)
| Status | Qty | Amount | |
|---|---|---|---|
| 8 | Cancelled | 0 | NaN |
| 29 | Cancelled | 0 | NaN |
| 65 | Cancelled | 0 | NaN |
| 84 | Cancelled | 0 | NaN |
| 95 | Cancelled | 0 | NaN |
# Filter rows where 'Amount' is NaN and select the relevant columns
only_nulls = ds_copy[ds_copy['Amount'].isna()][['Courier Status', 'Qty' , 'Amount']]
# Display the first 5 rows of the filtered data
print("The values of Amt,Qty,courier Status column ",only_nulls.head(5))
# Count null-Amount rows whose courier status is 'Cancelled'.
only_nulls_st= only_nulls[only_nulls['Courier Status'] == 'Cancelled' ][['Courier Status', 'Qty' , 'Amount']]
print("No of rows has cancelled value in courier status column",only_nulls_st.shape[0])
# Count null-Amount rows whose courier status is 'Unshipped'.
only_nulls_un= only_nulls[only_nulls['Courier Status'] == 'Unshipped' ][['Courier Status', 'Qty' , 'Amount']]
print("The values of Amt,Qty,Status column ",only_nulls_un.head(5))
# Bug fix: this line counts 'Unshipped' rows, but the original message said
# "cancelled", which mislabelled the printed number.
print("No of rows has Unshipped value in courier status column",only_nulls_un.shape[0])
# Every null Amount belongs to a Cancelled or Unshipped order (no revenue was
# realized), so zero is a meaningful fill value rather than an arbitrary one.
# Impute missing values in 'Amount' with zero
ds_copy['Amount']=ds_copy['Amount'].fillna(0)
# Verify if nulls remain
print("Null values count in Amount column",ds_copy['Amount'].isnull().sum())
The values of Amt,Qty,courier Status column Courier Status Qty Amount 8 Cancelled 0 NaN 29 Cancelled 0 NaN 65 Cancelled 0 NaN 84 Cancelled 0 NaN 95 Cancelled 0 NaN No of rows has cancelled value in courier status column 7671 The values of Amt,Qty,Status column Courier Status Qty Amount 937 Unshipped 15 NaN 1057 Unshipped 2 NaN 3631 Unshipped 9 NaN 3632 Unshipped 1 NaN 3634 Unshipped 1 NaN No of rows has cancelled value in courier status column 124 Null values count in Amount column 0
# Report the cardinality of the key categorical columns. nunique(dropna=False)
# matches len(unique()) by counting NaN as its own value.
for label, col in [('Style', 'Style'), ('SKU', 'SKU'), ('ASIN', 'ASIN'),
                   ('Courier Status', 'Courier Status'), ('B2B', 'B2B'),
                   ('promotion-ids', 'promotion-ids'), ('size', 'Size')]:
    print(f"unique values of {label}:", ds_copy[col].nunique(dropna=False))
unique values of Style: 1377 unique values of SKU: 7195 unique values of ASIN: 7190 unique values of Courier Status: 3 unique values of B2B: 2 unique values of promotion-ids: 5787 unique values of size: 11
ds_copy['Size'].unique().tolist()
['S', '3XL', 'XL', 'L', 'XXL', 'XS', '6XL', 'M', '4XL', '5XL', 'Free']
# Collapse the 11 raw garment sizes into three coarse buckets.
_SIZE_BUCKETS = (
    ('Small', ('XS', 'S')),
    ('Medium', ('M', 'L', 'XL')),
    ('Large', ('XXL', '3XL', '4XL', '5XL', '6XL', 'Free')),
)
size_mapping = {}
for bucket, sizes in _SIZE_BUCKETS:
    for size in sizes:
        size_mapping[size] = bucket
# Apply the size -> bucket mapping to the Size column.
ds_copy['Size_Aggregated'] = ds_copy['Size'].map(size_mapping)
# Spot-check the transformation.
print(ds_copy[['Size', 'Size_Aggregated']].head())
Size Size_Aggregated 0 S Small 1 3XL Large 2 XL Medium 3 L Medium 4 3XL Large
ds_copy['Size_Aggregated'].unique()
array(['Small', 'Large', 'Medium'], dtype=object)
# Unnamed: 22 column is dropped
# ds_copy.drop('Unnamed: 22', axis=1,inplace=True)
ds_copy['Size_Aggregated'].value_counts()
Size_Aggregated Medium 65719 Large 35005 Small 28251 Name: count, dtype: int64
# promotion-ids column Action: Impute using mode
# NOTE(review): this cell duplicates the imputation already performed earlier
# in the notebook (same column, same mode value), so the fillna below is a
# no-op at this point — consider deleting this cell on cleanup.
# Find mode of the column
# NB: the name 'ful_mode' is reused from the fulfilled-by cell although it now
# holds the promotion-ids mode; a distinct name would be clearer.
ful_mode=ds_copy['promotion-ids'].mode()[0]
print("mode value for promotion-ids column is :",ful_mode)
# Fill missing values with the mode
ds_copy['promotion-ids']=ds_copy['promotion-ids'].fillna(ful_mode)
# Verify if nulls remain
ds_copy['promotion-ids'].isnull().sum()
mode value for promotion-ids column is : IN Core Free Shipping 2015/04/08 23-48-5-108
0
# check the unique value of promotion _ids
unique_promotion_ids = ds_copy['promotion-ids'].unique()
print(f"Number of unique promotion IDs: {len(unique_promotion_ids)}")
Number of unique promotion IDs: 5787
# Frequency-encode promotion-ids: each row receives the total count of rows
# sharing its promotion id.
promo_counts = ds_copy['promotion-ids'].value_counts()
ds_copy['Promotion_IDs_Frequency'] = ds_copy['promotion-ids'].map(promo_counts)
# Spot-check the encoding.
print(ds_copy[['promotion-ids', 'Promotion_IDs_Frequency']].head())
promotion-ids Promotion_IDs_Frequency 0 IN Core Free Shipping 2015/04/08 23-48-5-108 95253 1 Amazon PLCC Free-Financing Universal Merchant ... 1 2 IN Core Free Shipping 2015/04/08 23-48-5-108 95253 3 IN Core Free Shipping 2015/04/08 23-48-5-108 95253 4 IN Core Free Shipping 2015/04/08 23-48-5-108 95253
sns.histplot(ds_copy['Promotion_IDs_Frequency'], bins=10, kde=True)
<Axes: xlabel='Promotion_IDs_Frequency', ylabel='Count'>
# Build one composite location key per row: city_state_postal_country.
location_parts = ['ship-city', 'ship-state', 'ship-postal-code', 'ship-country']
ds_copy['Ship_Location'] = ds_copy[location_parts].astype(str).agg('_'.join, axis=1)
# Spot-check the composite key.
print(ds_copy[location_parts + ['Ship_Location']].head())
ship-city ship-state ship-postal-code ship-country \
0 MUMBAI MAHARASHTRA 400081 IN
1 BENGALURU KARNATAKA 560085 IN
2 NAVI MUMBAI MAHARASHTRA 410210 IN
3 PUDUCHERRY PUDUCHERRY 605008 IN
4 CHENNAI TAMIL NADU 600073 IN
Ship_Location
0 MUMBAI_MAHARASHTRA_400081_IN
1 BENGALURU_KARNATAKA_560085_IN
2 NAVI MUMBAI_MAHARASHTRA_410210_IN
3 PUDUCHERRY_PUDUCHERRY_605008_IN
4 CHENNAI_TAMIL NADU_600073_IN
# Frequency Encoding for Shipping_Location: each row receives the total count
# of rows sharing its composite location key.
loc_counts = ds_copy['Ship_Location'].value_counts()
ds_copy['Ship_Location_Frequency'] = ds_copy['Ship_Location'].map(loc_counts)
# Spot-check the encoding.
print(ds_copy[['Ship_Location', 'Ship_Location_Frequency']].head())
Ship_Location Ship_Location_Frequency 0 MUMBAI_MAHARASHTRA_400081_IN 111 1 BENGALURU_KARNATAKA_560085_IN 156 2 NAVI MUMBAI_MAHARASHTRA_410210_IN 184 3 PUDUCHERRY_PUDUCHERRY_605008_IN 42 4 CHENNAI_TAMIL NADU_600073_IN 172
ds_copy.columns.tolist()
['index', 'Order ID', 'Date', 'Status', 'Fulfilment', 'Sales Channel ', 'ship-service-level', 'Style', 'SKU', 'Category', 'Size', 'ASIN', 'Courier Status', 'Qty', 'currency', 'Amount', 'ship-city', 'ship-state', 'ship-postal-code', 'ship-country', 'promotion-ids', 'B2B', 'fulfilled-by', 'Size_Aggregated', 'Promotion_IDs_Frequency', 'Ship_Location', 'Ship_Location_Frequency']
ds_copy['Ship_Location_Frequency'].head()
0 111 1 156 2 184 3 42 4 172 Name: Ship_Location_Frequency, dtype: int64
# EDA part: distribution of the transaction amount.
sns.histplot(ds_copy['Amount'], bins=30, kde=True)
plt.title('Distribution of Amount (Prices)')
plt.show()
# A log1p transform compresses the heavy right tail of the price distribution.
ds_copy['Log_Amount'] = np.log1p(ds_copy['Amount'])
sns.histplot(ds_copy['Log_Amount'], bins=30, kde=True)
plt.title('Log-Transformed Prices')
plt.show()
print(ds_copy[ds_copy['Amount'] <= 1].shape[0])  # rows with Amount <= 1
print(ds_copy.shape[0])                          # total rows for comparison
# Note: np.log1p(x) computes log(1 + x) and is well defined at zero
# (log1p(0) == 0), so zeros need no special handling for this transform;
# only a plain np.log(x) would be undefined at x == 0.
10138 128975
# Log-transform Amount with np.log1p, i.e. log(1 + x).
# Fix: the previous version added 1e-6 before the transform "to handle zeros",
# but log1p is already defined at zero (log1p(0) == 0), so the epsilon only
# injected noise into every value. No offset is needed.
ds_copy['Log_Amount_Adjusted'] = np.log1p(ds_copy['Amount'])
ds_copy['Log_Amount_Adjusted'].head()
0 6.474847 1 6.008813 2 5.799093 3 6.625830 4 6.354370 Name: Log_Amount_Adjusted, dtype: float64
# Flag whether each order's amount is exactly zero.
ds_copy['Amount_Category'] = np.where(ds_copy['Amount'] == 0, 'Zero', 'Non-Zero')
ds_copy['Amount_Category'].value_counts()
Amount_Category Non-Zero 118837 Zero 10138 Name: count, dtype: int64
# Boxplot before winsorizing, to show the outliers.
sns.boxplot(x=ds_copy['Amount'])
plt.title('Boxplot of Amount')
plt.show()
# Winsorize 'Amount': cap values below the 5th percentile and above the 95th.
lower_bound = ds_copy['Amount'].quantile(0.05)
upper_bound = ds_copy['Amount'].quantile(0.95)
# Series.clip performs both caps in one vectorized call — the idiomatic
# equivalent of the two chained .where() masks it replaces.
ds_copy['Amount'] = ds_copy['Amount'].clip(lower=lower_bound, upper=upper_bound)
# Boxplot after winsorizing, to confirm the tails are capped.
sns.boxplot(x=ds_copy['Amount'])
plt.title('Boxplot of Amount')
plt.show()
# Analyze Amount_Category
# Separate analysis for Non-Zero and Zero categories in the Amount_Category column:
# Check the proportion (normalize=True yields fractions; * 100 converts to percent):
ds_copy['Amount_Category'].value_counts(normalize=True) * 100
Amount_Category Non-Zero 92.139562 Zero 7.860438 Name: proportion, dtype: float64
# Analyze the pattern by splitting the frame on Amount_Category.
zero_category = ds_copy[ds_copy['Amount_Category'] == 'Zero']
non_zero_category = ds_copy[ds_copy['Amount_Category'] == 'Non-Zero']
# Numeric summary of the non-zero group
# (run zero_category.describe() for the zero-amount group).
non_zero_category.describe()
| index | Qty | Amount | ship-postal-code | Promotion_IDs_Frequency | Ship_Location_Frequency | Log_Amount | Log_Amount_Adjusted | |
|---|---|---|---|---|---|---|---|---|
| count | 118837.000000 | 118837.000000 | 118837.000000 | 118837.000000 | 118837.000000 | 118837.000000 | 118837.000000 | 118837.000000 |
| mean | 64452.670229 | 0.960467 | 652.090141 | 464133.377803 | 68867.052745 | 81.768776 | 6.419550 | 6.419550 |
| std | 37211.968633 | 0.216387 | 243.646091 | 191142.408316 | 42609.965810 | 123.117205 | 0.388115 | 0.388115 |
| min | 0.000000 | 0.000000 | 199.000000 | 110001.000000 | 1.000000 | 1.000000 | 5.298317 | 5.298317 |
| 25% | 32260.000000 | 1.000000 | 458.000000 | 382449.000000 | 146.000000 | 8.000000 | 6.129050 | 6.129050 |
| 50% | 64410.000000 | 1.000000 | 622.000000 | 500034.000000 | 95253.000000 | 34.000000 | 6.434547 | 6.434547 |
| 75% | 96632.000000 | 1.000000 | 788.000000 | 600023.000000 | 95253.000000 | 98.000000 | 6.670766 | 6.670766 |
| max | 128974.000000 | 8.000000 | 1166.000000 | 989898.000000 | 95253.000000 | 821.000000 | 8.627840 | 8.627840 |
ds_copy['Ship_Location_Frequency']
0 111
1 156
2 184
3 42
4 172
...
128970 111
128971 148
128972 393
128973 1
128974 1
Name: Ship_Location_Frequency, Length: 128975, dtype: int64
# Date-based features.
# Dates are stored as MM-DD-YY strings (e.g. '04-30-22' -> 2022-04-30, as the
# sample output confirms); passing an explicit format avoids ambiguous
# inference and is faster than per-element format guessing.
ds_copy['Date'] = pd.to_datetime(ds_copy['Date'], format='%m-%d-%y')
# Extract date-based features
ds_copy['Year'] = ds_copy['Date'].dt.year
ds_copy['Month'] = ds_copy['Date'].dt.month
ds_copy['Day'] = ds_copy['Date'].dt.day
ds_copy['Day_of_Week'] = ds_copy['Date'].dt.dayofweek
# dayofweek: Monday == 0 ... Sunday == 6, so >= 5 marks Saturday/Sunday.
ds_copy['Is_Weekend'] = (ds_copy['Day_of_Week'] >= 5).astype(int)
print(ds_copy[['Year','Month','Day','Day_of_Week','Is_Weekend']].head())
Year Month Day Day_of_Week Is_Weekend 0 2022 4 30 5 1 1 2022 4 30 5 1 2 2022 4 30 5 1 3 2022 4 30 5 1 4 2022 4 30 5 1
# Promotion ids are comma-separated within one cell; count them per order.
def _count_promotions(ids):
    # An empty string means the order carried no promotion at all.
    return 0 if ids == '' else len(ids.split(','))

ds_copy['num_promotions'] = ds_copy['promotion-ids'].apply(_count_promotions)
# Spot-check the derived count.
print(ds_copy[['Order ID', 'promotion-ids', 'num_promotions']].head())
Order ID promotion-ids \ 0 405-8078784-5731545 IN Core Free Shipping 2015/04/08 23-48-5-108 1 171-9198151-1101146 Amazon PLCC Free-Financing Universal Merchant ... 2 404-0687676-7273146 IN Core Free Shipping 2015/04/08 23-48-5-108 3 403-9615377-8133951 IN Core Free Shipping 2015/04/08 23-48-5-108 4 407-1069790-7240320 IN Core Free Shipping 2015/04/08 23-48-5-108 num_promotions 0 1 1 25 2 1 3 1 4 1
# Flag orders carrying more than three promotions.
ds_copy['High_Promotion'] = (ds_copy['num_promotions'] > 3).astype(int)
ds_copy['High_Promotion'].head()
0 0 1 1 2 0 3 0 4 0 Name: High_Promotion, dtype: int64
# Total number of promotions applied within each product category.
category_promotions = ds_copy.groupby('Category')['num_promotions'].sum()
print(category_promotions)
Category Blouse 4492 Bottom 3529 Dupatta 3 Ethnic Dress 6163 Saree 677 Set 289997 Top 54070 Western Dress 133829 kurta 273435 Name: num_promotions, dtype: int64
# Total number of promotions per shipping country
# (the output shows only 'IN' is present in this dataset).
country_promotions = ds_copy.groupby('ship-country')['num_promotions'].sum()
print(country_promotions)
ship-country IN 766195 Name: num_promotions, dtype: int64
# Remove rows where Qty is zero — a per-unit price is undefined for them.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below cannot raise SettingWithCopyWarning or write through a view.
ds_copy = ds_copy[ds_copy['Qty'] != 0].copy()
# Derived feature: price paid per unit.
ds_copy['Price_Per_Unit'] = ds_copy['Amount'] / ds_copy['Qty']
ds_copy['Price_Per_Unit'].head()
# Distribution of the per-unit price.
sns.histplot(ds_copy['Price_Per_Unit'], kde=True)
plt.title('Distribution of Price Per Unit')
plt.show()
# Pearson correlation between quantity and per-unit price.
correlation = ds_copy[['Qty', 'Price_Per_Unit']].corr()
print(correlation)
# A correlation of -0.036668 means that there is a very weak negative correlation between Quantity (Qty) and Price Per Unit.
# While there may be some slight trend where Price Per Unit decreases as Qty increases, it is not a strong or reliable
# relationship. This suggests that, in your dataset, quantity does not have a significant impact on the price per unit.
Qty Price_Per_Unit Qty 1.000000 -0.036668 Price_Per_Unit -0.036668 1.000000
# Pearson correlation between quantity and total amount.
correlation = ds_copy[['Qty', 'Amount']].corr()
print(correlation)
# The resulting ~0.05 coefficient indicates only a very weak positive
# relationship: Amount rises marginally with Qty, but the link is far too
# weak to support predictions or strong conclusions on its own.
Qty Amount Qty 1.000000 0.051943 Amount 0.051943 1.000000
ds_copy.shape[0]
116168
# Visualizations for Relationships
# Amount vs Promotion_IDs_Frequency
# Bug fix: plotly.express was only imported further down the notebook, so a
# fresh top-to-bottom run hit a NameError on `px` here; import before use.
import plotly.express as px

fig1 = px.scatter(ds_copy, x="Promotion_IDs_Frequency", y="Amount", title="Amount vs. Promotion IDs Frequency")
fig1.show()
ds_copy.columns
Index(['index', 'Order ID', 'Date', 'Status', 'Fulfilment', 'Sales Channel ',
'ship-service-level', 'Style', 'SKU', 'Category', 'Size', 'ASIN',
'Courier Status', 'Qty', 'currency', 'Amount', 'ship-city',
'ship-state', 'ship-postal-code', 'ship-country', 'promotion-ids',
'B2B', 'fulfilled-by', 'Size_Aggregated', 'Promotion_IDs_Frequency',
'Ship_Location', 'Ship_Location_Frequency', 'Log_Amount',
'Log_Amount_Adjusted', 'Amount_Category', 'Year', 'Month', 'Day',
'Day_of_Week', 'Is_Weekend', 'num_promotions', 'High_Promotion',
'Price_Per_Unit'],
dtype='object')
# Visualization 2: Amount vs. Promotion_IDs_Frequency
import plotly.express as px

# Static seaborn version.
plt.figure(figsize=(8, 6))
sns.boxplot(data=ds_copy, x='Promotion_IDs_Frequency', y='Amount')
plt.title('Amount vs. Promotion IDs Frequency')
plt.show()
# Interactive plotly version of the same comparison.
fig1 = px.box(ds_copy, x="Promotion_IDs_Frequency", y="Amount", title="Amount vs. Promotion IDs Frequency")
fig1.show()
ds_copy['Ship_Location_Frequency'].value_counts()
Ship_Location_Frequency
1 7582
2 5554
3 3953
4 3095
5 2764
...
116 107
120 107
107 105
114 101
102 91
Name: count, Length: 210, dtype: int64
# Correlation heatmap over the numeric columns only.
numerical_ds_copy = ds_copy.select_dtypes(include=['number'])
correlation_matrix = numerical_ds_copy.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', linewidths=0.5)
plt.title("Correlation Matrix for Numerical Columns")
plt.show()
# normalization function to handle case variations and abbreviations
# Known aliases and misspellings of state names. Built once at module level so
# the table is not reconstructed on every call (it previously was rebuilt once
# per row under Series.apply). Identity entries from the original table
# ('delhi' -> 'delhi', etc.) were no-ops under .get() and are dropped.
_STATE_CORRECTIONS = {
    'new delhi': 'delhi', 'nd': 'delhi',
    'pondicherry': 'puducherry',
    'rajshthan': 'rajasthan', 'rj': 'rajasthan',
    'orissa': 'odisha',
    'pb': 'punjab',
}

def normalize_state(state):
    """Normalize a ship-state value: lowercase it and map known aliases or
    misspellings (e.g. 'Orissa' -> 'odisha', 'PB' -> 'punjab') onto canonical
    names. Unknown states are returned lowercased but otherwise unchanged."""
    state = state.lower()  # Convert to lowercase
    return _STATE_CORRECTIONS.get(state, state)
# Normalize the 'ship-state' column
ds_copy['ship-state'] = ds_copy['ship-state'].apply(normalize_state)
# region mapping (in lowercase)
region_mapping = {
    'south': ['tamil nadu', 'kerala', 'karnataka', 'andhra pradesh'],
    'north': ['delhi', 'haryana', 'punjab', 'uttar pradesh'],
    'east': ['west bengal', 'odisha', 'bihar', 'assam', 'jharkhand', 'meghalaya', 'tripura', 'mizoram', 'nagaland'],
    'west': ['maharashtra', 'gujarat', 'rajasthan', 'goa'],
    'central': ['madhya pradesh', 'chhattisgarh'],
    'other': ['chandigarh', 'lakshadweep', 'andaman & nicobar', 'jammu & kashmir', 'sikkim', 'ladakh']
}
# Invert once to a state -> region lookup: O(1) per call instead of scanning
# every region's list for every row of the apply(). The state lists are
# disjoint, so the inversion is unambiguous.
_STATE_TO_REGION = {
    state: region
    for region, states in region_mapping.items()
    for state in states
}

def map_region(state):
    """Return the region name for a (lowercase) state, or 'other' when the
    state appears in none of the region lists."""
    return _STATE_TO_REGION.get(state, 'other')
# Tag every row with its region using the mapping function.
ds_copy['Region'] = ds_copy['ship-state'].map(map_region)
# Count the number of orders that fall into each region.
grouped_data = ds_copy.groupby('Region').size()
print(grouped_data)
Region central 3078 east 12389 north 21621 other 14432 south 36777 west 27871 dtype: int64
ds_copy.columns.tolist()
['index', 'Order ID', 'Date', 'Status', 'Fulfilment', 'Sales Channel ', 'ship-service-level', 'Style', 'SKU', 'Category', 'Size', 'ASIN', 'Courier Status', 'Qty', 'currency', 'Amount', 'ship-city', 'ship-state', 'ship-postal-code', 'ship-country', 'promotion-ids', 'B2B', 'fulfilled-by', 'Size_Aggregated', 'Promotion_IDs_Frequency', 'Ship_Location', 'Ship_Location_Frequency', 'Log_Amount', 'Log_Amount_Adjusted', 'Amount_Category', 'Year', 'Month', 'Day', 'Day_of_Week', 'Is_Weekend', 'num_promotions', 'High_Promotion', 'Price_Per_Unit', 'Region']
ds_copy.Region
1 south
2 west
4 south
5 north
6 other
...
128970 other
128971 north
128972 other
128973 west
128974 central
Name: Region, Length: 116168, dtype: object
# Tableau visualization export
d = ds_copy.copy()
# Only the columns needed for the Tableau dashboards, grouped by purpose.
columns_needed = [
    'Date',                # time-based analysis
    'Category',            # product segmentation
    'Amount',              # revenue
    'Price_Per_Unit',      # price point
    'ship-service-level',  # shipping service level
    'Status',              # courier status
    'Fulfilment',          # fulfillment method performance
    'Qty',                 # quantity sold (demand)
    'promotion-ids',       # promotional analysis
    'num_promotions',      # number of promotions
    'Region',              # geographic insights
    'ship-country',
    'Day_of_Week',         # seasonal trends
    'Is_Weekend',          # weekend impact
    'High_Promotion',      # promotion flags
    'Year',                # year for trend analysis
    'Month',               # month for trend analysis
]
filtered_df = d[columns_needed]
# Persist the trimmed dataset for Tableau.
filtered_file_path = 'Dataset_for_tableau_main_proj.csv'
filtered_df.to_csv(filtered_file_path, index=False)
print(f"Filtered dataset saved to {filtered_file_path}")
Filtered dataset saved to Dataset_for_tableau_main_proj.csv
ds_copy.columns
Index(['index', 'Order ID', 'Date', 'Status', 'Fulfilment', 'Sales Channel ',
'ship-service-level', 'Style', 'SKU', 'Category', 'Size', 'ASIN',
'Courier Status', 'Qty', 'currency', 'Amount', 'ship-city',
'ship-state', 'ship-postal-code', 'ship-country', 'promotion-ids',
'B2B', 'fulfilled-by', 'Size_Aggregated', 'Promotion_IDs_Frequency',
'Ship_Location', 'Ship_Location_Frequency', 'Log_Amount',
'Log_Amount_Adjusted', 'Amount_Category', 'Year', 'Month', 'Day',
'Day_of_Week', 'Is_Weekend', 'num_promotions', 'High_Promotion',
'Price_Per_Unit', 'Region'],
dtype='object')